In this notebook, we are using the tmb_genomic.tsv file
generated from the 01-preprocess-data.Rmd script.
suppressPackageStartupMessages({
library(tidyverse)
library(scales)
})
# Detect the \.git\ folder. This will be in the project root directory.
# Use this as the root directory to ensure proper sourcing of functions
# no matter where this is called from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(\.git\))
analysis_dir <- file.path(root_dir, \analyses\, \tmb-vaf-longitudinal\)
input_dir <- file.path(analysis_dir, \input\)
# File path to results directory
results_dir <-
file.path(analysis_dir, \results\)
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
# Input files
tmb_genomic_file <- file.path(results_dir, \tmb_vaf_genomic.tsv\)
palette_file <- file.path(root_dir, \figures\, \palettes\, \tumor_descriptor_color_palette.tsv\)
# File path to plots directory
plots_dir <-
file.path(analysis_dir, \plots\)
if (!dir.exists(plots_dir)) {
dir.create(plots_dir)
}
source(paste0(analysis_dir, \/util/function-create-barplot.R\))
source(paste0(analysis_dir, \/util/function-create-dumbbell-plot.R\))
source(paste0(root_dir, \/figures/scripts/theme.R\))
# Read and process tmb_genomic file
df_total <- readr::read_tsv(tmb_genomic_file, guess_max = 100000, show_col_types = FALSE) %>%
group_by(Kids_First_Participant_ID) %>%
mutate(cg_distinct = n_distinct(cancer_group) > 1) # to identify samples with different diagnosis across timepoints
# Are there any samples with both WGS and WXS?
df_total %>%
unique() %>%
arrange(Kids_First_Participant_ID, experimental_strategy) %>%
group_by(Kids_First_Participant_ID) %>%
dplyr::summarise(experimental_strategy_sum = str_c(experimental_strategy, collapse = \;\))
# There are, so let's remove these from downstream analyses.
df <- df_total %>%
filter(!experimental_strategy == \WXS\) %>%
dplyr::mutate(patient_id = paste(short_histology, Kids_First_Participant_ID, sep = \_\)) %>%
distinct(cancer_group, .keep_all = TRUE) %>%
summarise(cg_sum = str_c(cancer_group, collapse = \
# A tibble: 37 × 2
cg_sum n
<chr> <int>
1 High-grade glioma 26116
2 Meningioma,Medulloblastoma 3713
3 Atypical Teratoid Rhabdoid Tumor 2944
4 Rosai-Dorfman disease,Sarcoma 2819
5 Diffuse midline glioma 2012
6 Medulloblastoma 1222
7 Ependymoma 906
8 Low-grade glioma 737
9 Chordoma 477
10 CNS Embryonal tumor 200
# ℹ 27 more rows
# Let's summarize cancer groups with < 10 bs_samples as Other and use this for visualization purposes
cg_sum_df <- df %>%
count(cg_sum) %>%
dplyr::mutate(cg_sum_n = glue::glue(\{cg_sum} (N={n})\))
df <- df %>%
left_join(cg_sum_df, by = c(\cg_sum\)) %>%
mutate(cg_plot = case_when(n < 10 ~ \Other\,
TRUE ~ cg_sum),
cg_kids_id = paste(cg_sum, Kids_First_Participant_ID, sep = \_\))
# How many bs_samples per cg_plot?
print(df %>% count(cg_plot) %>% arrange(desc(n)))
# A tibble: 37 × 2
cg_plot n
<chr> <int>
1 High-grade glioma 26116
2 Meningioma,Medulloblastoma 3713
3 Atypical Teratoid Rhabdoid Tumor 2944
4 Rosai-Dorfman disease,Sarcoma 2819
5 Diffuse midline glioma 2012
6 Medulloblastoma 1222
7 Ependymoma 906
8 Low-grade glioma 737
9 Chordoma 477
10 CNS Embryonal tumor 200
# ℹ 27 more rows
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE)
# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$color_names
# length(unique(df$Kids_First_Participant_ID))
We will explore TMB per Kids_First_Participant_ID over
time by creating stacked barplots.
# Define parameters for function
ylim = max(df$tmb)
x_value <- df$Kids_First_Participant_ID
# Re-order df
f <- c(\Second Malignancy\, \Unavailable\, \Deceased\, \Recurrence\, \Progressive\, \Diagnosis\) # Level df by timepoints
df_plot <- df %>%
dplyr:::mutate(tumor_descriptor = factor(tumor_descriptor),
tumor_descriptor = fct_relevel(tumor_descriptor, f))
# Run function
fname <- paste0(plots_dir, \/\, \TMB-genomic-total.pdf\)
print(fname)
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-total.pdf\
p <- create_stacked_barplot(tmb_df = df_plot, ylim = ylim, x = x_value, palette = palette)
pdf(file = fname, width = 22, height = 6)
print(p)
dev.off()
png
2
Attention: Hypermutant TMB defined as ≥10 Mb, and Ultrahypermutant TMB defined as ≥100 mutations/Mb in pediatric brain tumors (https://pubmed.ncbi.nlm.nih.gov/29056344/).
Here, we notice that there are samples with high TMB (hyper-mutant samples). Next, we will exclude these samples (threshold >= 10) from downstream analysis. Attention is needed in cases with high number of mutations in only one timepoint as this will lead to un-matched longitudinal samples. We will also remove those so we always have matched longitudinal samples.
# Filter df and remove any samples with single timepoints
df_plot_filter <- df %>%
filter(!tmb >= 10) %>%
unique() %>%
arrange(Kids_First_Participant_ID, tumor_descriptor) %>%
group_by(Kids_First_Participant_ID) %>%
dplyr::summarise(tumor_descriptor_sum = str_c(tumor_descriptor, collapse = \;\)) %>%
filter(!tumor_descriptor_sum %in% c(\Diagnosis\, \Progressive\, \Recurrence\, \Second Malignancy\, \Unavailable\, \Deceased\, \Progressive;Progressive\)) %>%
dplyr::left_join(df, by = c(\Kids_First_Participant_ID\, \tumor_descriptor_sum\)) %>%
mutate(tumor_descriptor = factor(tumor_descriptor),
tumor_descriptor = fct_relevel(tumor_descriptor, f)) %>%
drop_na(tmb)
# length(unique(df_plot_filter$Kids_First_Participant_ID))
# Define parameters for function
ylim <- max(df_plot_filter$tmb)
df_plot_filter <- df_plot_filter
x_value <- df_plot_filter$cg_kids_id
# Run function
fname <- paste0(plots_dir, \/\, \TMB-genomic-no-hypermutants.pdf\)
print(fname)
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/TMB-genomic-no-hypermutants.pdf\
p <- create_stacked_barplot(tmb_df = df_plot_filter, ylim = ylim, x = x_value, palette = palette)
pdf(file = fname, width = 25, height = 10)
print(p)
dev.off()
png
2
We will explore TMB per cancer group over time by creating dumbbell plots. We classified by using cancer types with the highest number of samples (High- and Low-grade gliomas) versus any other cancer groups.
# How many bs_samples per kids_id and cancer group?
# print(table(df_plot_filter$cg_plot))
print(df_plot_filter %>%
count(cg_plot, Kids_First_Participant_ID))
# A tibble: 51 × 3
cg_plot Kids_First_Participant_ID n
<chr> <chr> <int>
1 Adamantinomatous Craniopharyngioma PT_YK7AD0KK 19
2 Atypical Teratoid Rhabdoid Tumor PT_DVXE38EX 198
3 Atypical Teratoid Rhabdoid Tumor PT_HE8FBFNA 16
4 Atypical Teratoid Rhabdoid Tumor PT_VTG1S395 54
5 CNS Embryonal tumor PT_BRVGRXQY 87
6 CNS Embryonal tumor,Fibromyxoid lesion PT_98QMQZY7 68
7 Diffuse midline glioma PT_5NS35B66 93
8 Diffuse midline glioma PT_JNEV57VK 456
9 Diffuse midline glioma PT_JSFBMK5V 94
10 Diffuse midline glioma PT_NK8A49X5 255
# ℹ 41 more rows
# Dumbbell plot per cancer group
cancer_groups <- unique(as.character(df_plot_filter$cg_plot))
cancer_groups <- sort(cancer_groups, decreasing = FALSE)
print(cancer_groups)
[1] \Adamantinomatous Craniopharyngioma\
[2] \Atypical Teratoid Rhabdoid Tumor\
[3] \CNS Embryonal tumor\
[4] \CNS Embryonal tumor
for (i in seq_along(cancer_groups)) {
print(i)
df_ct_sub <- df_plot_filter %>%
filter(cg_plot == cancer_groups [i])
if (i %in% c(3, 7, 8)) {
print(cancer_groups [i])
# Define parameters for function
ylim <- 2
} else if (i == 2) {
print(cancer_groups [i])
# Define parameters for function
ylim <- 6
} else {
print(cancer_groups [i])
# Define parameters for function
ylim <- 4
}
# Name plots
fname <- paste0(plots_dir, \/\, cancer_groups[i], \-TMB-dumbbell\, \.pdf\)
print(fname)
# Run function
p <- create_dumbbell_ct(tmb_df = df_ct_sub,
ylim = ylim,
ct_id = cancer_groups[i],
palette = palette)
pdf(file = fname, width = 12, height = 8)
print(p)
dev.off()
}
[1] 1
[1] \Adamantinomatous Craniopharyngioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Adamantinomatous Craniopharyngioma-TMB-dumbbell.pdf\
[1] 2
[1] \Atypical Teratoid Rhabdoid Tumor\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Atypical Teratoid Rhabdoid Tumor-TMB-dumbbell.pdf\
[1] 3
[1] \CNS Embryonal tumor\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/CNS Embryonal tumor-TMB-dumbbell.pdf\
[1] 4
[1] \CNS Embryonal tumor
[1] 5
[1] \Diffuse midline glioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Diffuse midline glioma-TMB-dumbbell.pdf\
[1] 6
[1] \Dysembryoplastic neuroepithelial tumor
[1] 7
[1] \Ependymoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Ependymoma-TMB-dumbbell.pdf\
[1] 8
[1] \Ependymoma
[1] 9
[1] \Ganglioglioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Ganglioglioma-TMB-dumbbell.pdf\
[1] 10
[1] \Glial-neuronal tumor
[1] 11
[1] \High-grade glioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/High-grade glioma-TMB-dumbbell.pdf\
[1] 12
[1] \Low-grade glioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Low-grade glioma-TMB-dumbbell.pdf\
[1] 13
[1] \Low-grade glioma
[1] 14
[1] \Low-grade glioma
[1] 15
[1] \Medulloblastoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Medulloblastoma-TMB-dumbbell.pdf\
[1] 16
[1] \Medulloblastoma
[1] 17
[1] \Meningioma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Meningioma-TMB-dumbbell.pdf\
[1] 18
[1] \Neuroblastoma
[1] 19
[1] \Pilocytic astrocytoma
[1] 20
[1] \Schwannoma\
[1] \/home/rstudio/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Schwannoma-TMB-dumbbell.pdf\
Here, we want to explore the number of mutations per timepoint and biospecimen sample per patient case.
samples <- unique(as.character(df_plot_filter$Kids_First_Participant_ID))
print(samples)
[1] \PT_02J5CWN5\ \PT_0DWRY9ZX\ \PT_1ZAWNGWT\ \PT_2FVTD0WR\ \PT_2MZPGZN1\
[6] \PT_2YT37G8P\ \PT_37B5JRP1\ \PT_39H4JN6H\ \PT_3GYW6P6P\ \PT_3P3HARZ2\
[11] \PT_3R0P995B\ \PT_3T3VGWC6\ \PT_3VCS1PPF\ \PT_5NS35B66\ \PT_5ZPPR06P\
[16] \PT_62G82T6Q\ \PT_82MX6J77\ \PT_89XRZBSG\ \PT_962TCBVR\ \PT_98QMQZY7\
[21] \PT_99S5BPE3\ \PT_B5DQ8FF0\ \PT_BRVGRXQY\ \PT_BZCJMEX8\ \PT_CWXSP19D\
[26] \PT_CXT81GRM\ \PT_DFQAH7RS\ \PT_DNAJYFZT\ \PT_DVXE38EX\ \PT_FN4GEEFR\
[31] \PT_HE8FBFNA\ \PT_HHG37M6W\ \PT_JNEV57VK\ \PT_JSFBMK5V\ \PT_KMHGNCNR\
[36] \PT_NK8A49X5\ \PT_P571HTNK\ \PT_PF04R0BH\ \PT_PR4YBBH3\ \PT_QH9H491G\
[41] \PT_S2SQJVGK\ \PT_T4VN7ZRB\ \PT_TKWTTRQ7\ \PT_TP6GS00H\ \PT_TRZ1N1HQ\
[46] \PT_VTG1S395\ \PT_XA98HG1C\ \PT_XHYBZKCX\ \PT_YK7AD0KK\ \PT_Z4GS3ZQQ\
[51] \PT_ZMKMKCFQ\
for (i in seq_along(samples)) {
print(i)
tmb_sub <- df_plot_filter %>%
filter(Kids_First_Participant_ID == samples[i])
# Define parameters for function
ylim = max(df_plot_filter$mutation_count)
# Run function
p <- create_barplot_sample(tmb_df = tmb_sub,
ylim = ylim,
sid = samples[i],
palette = palette)
print(p)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] ggthemes_4.2.4 scales_1.2.1 lubridate_1.9.2 forcats_1.0.0
[5] stringr_1.5.0 dplyr_1.1.1 purrr_1.0.1 readr_2.1.4
[9] tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.0 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] highr_0.10 bslib_0.4.2 compiler_4.2.3 pillar_1.9.0
[5] jquerylib_0.1.4 tools_4.2.3 bit_4.0.5 digest_0.6.31
[9] timechange_0.2.0 jsonlite_1.8.4 evaluate_0.20 lifecycle_1.0.3
[13] gtable_0.3.3 pkgconfig_2.0.3 rlang_1.1.0 cli_3.6.1
[17] parallel_4.2.3 yaml_2.3.7 xfun_0.38 fastmap_1.1.1
[21] withr_2.5.0 knitr_1.42 generics_0.1.3 vctrs_0.6.2
[25] sass_0.4.5 hms_1.1.3 bit64_4.0.5 rprojroot_2.0.3
[29] tidyselect_1.2.0 glue_1.6.2 R6_2.5.1 fansi_1.0.4
[33] vroom_1.6.1 rmarkdown_2.21 farver_2.1.1 tzdb_0.3.0
[37] magrittr_2.0.3 htmltools_0.5.5 colorspace_2.1-0 labeling_0.4.2
[41] utf8_1.2.3 stringi_1.7.12 munsell_0.5.0 cachem_1.0.7
[45] crayon_1.5.2